# install.packages("tidyverse")
# install.packages("timetk")
library(tidyverse)
library(readr) #read_csv()
library(dplyr)
library(glmnet)
library(timetk)
library(rsample)

1) Load Data

# Load data:
df <- read_csv("./data/walmart_features.csv")
head(df)

Data Preprocessing + Sort Values Notes: Date is already in Date format. Here we are just converting store and holiday_flag from numeric/character to categorical (factor) type and dropping redundant columns.

print(sapply(df, class)) # view col datatypes
             store               date               year              month               week              is_q4       weekly_sales       holiday_flag 
         "numeric"             "Date"          "numeric"          "numeric"          "numeric"          "logical"          "numeric"        "character" 
       temperature         fuel_price                cpi       unemployment               lag1               lag2               lag4               lag8 
         "numeric"          "numeric"          "numeric"          "numeric"          "numeric"          "numeric"          "numeric"          "numeric" 
               ma4                ma8 store_mean_to_prev   store_sd_to_prev               sin1               cos1               sin2               cos2 
         "numeric"          "numeric"          "numeric"          "numeric"          "numeric"          "numeric"          "numeric"          "numeric" 
    is_holiday_num inter_holiday_lag1 inter_temp_holiday    inter_cpi_unemp 
         "numeric"          "numeric"          "numeric"          "numeric" 
df <- df %>%
  mutate(
    store = as.factor(store),
    holiday_flag = factor(holiday_flag, levels = unique(holiday_flag))
  ) %>%
  arrange(date, store) %>%  # Sort by date, then store, ascending order
  select(-is_holiday_num) # drop col

head(df)
# Checking the data (uncomment to see output if you wish)
print(dim(df))
[1] 6075   27
#table(df$date) # get value_counts per date -> is 45.

There are 45 stores, so it makes sense that there are 45 entries for each date. Also, each date represents one week.

2) Train/Test split on sorted data:

Here we perform an 80/20 train/test split to hold back data for comparing different models’ performances. This is not a random split, as we need to preserve time ordering. Also, it is not an 80/20 split on the row indices, since that could cause rows from different stores that share the same date to be split up.

# Train/Test Split:
all_dates <- unique(df$date) # already sorted
cutoff_ix <- floor(0.8 * length(all_dates))
cutoff_date <- all_dates[cutoff_ix]
print(cutoff_date)
[1] "2012-04-20"
train_df <- df %>% filter(date <= cutoff_date)
test_df  <- df %>% filter(date >  cutoff_date)

print(dim(train_df))
[1] 4860   27
print(dim(test_df))
[1] 1215   27
cat("train:", format(min(train_df$date), "%Y-%m-%d"), " to ", format(max(train_df$date), "%Y-%m-%d"),"\n")
train: 2010-04-02  to  2012-04-20 
cat("test:", format(min(test_df$date), "%Y-%m-%d"), " to ", format(max(test_df$date), "%Y-%m-%d"),"\n")
test: 2012-04-27  to  2012-10-26 
print(colnames(df))
 [1] "store"              "date"               "year"               "month"              "week"               "is_q4"              "weekly_sales"      
 [8] "holiday_flag"       "temperature"        "fuel_price"         "cpi"                "unemployment"       "lag1"               "lag2"              
[15] "lag4"               "lag8"               "ma4"                "ma8"                "store_mean_to_prev" "store_sd_to_prev"   "sin1"              
[22] "cos1"               "sin2"               "cos2"               "inter_holiday_lag1" "inter_temp_holiday" "inter_cpi_unemp"   

3) Set Up Rolling Time-series cross validation

It is important that validation windows always come after their training windows in time; otherwise the model could cheat by using information from the future to predict the past (leakage).

# NOTE - ctrl shift c to mass uncomment/comment out

# Attempt 1 - shorter
# cv_folds <- time_series_cv(
#   data = train_df,
#   date_var = Date,
#   cumulative = FALSE, # no growing window (same size folds)
#   initial = "6 months", # length of train window
#   assess = "3 months", # length of validation window
#   skip = "3 months", # jump between folds
#   slice_limit = 5 # num folds max
# )

# Attempt 2 -- longer, cumulative windowing
cv_folds <- time_series_cv(
  data = train_df,
  date_var = date,
  cumulative = TRUE, # growing window: each fold's train window expands over all earlier data
  initial = "1 year", # length of the FIRST train window (it grows afterwards)
  assess = "3 months", # length of validation window
  skip = "3 months", # jump between folds
  slice_limit = 5 # num folds max
)

plot_time_series_cv_plan(cv_folds, .date_var = date, .value = weekly_sales, .interactive=TRUE)

Additional setup: Define Functions for Standardization Normalize weekly sales within each store, so that no one store dominates, and weekly sales are comparable. i.e. (sales_store - mean(sales_store)) / std(sales_store) and can convert back later via (scaled_sales * store_std) + store_mean

##################### Fit Standard Scaler ###########################
# Fit a per-store standard scaler.
#
# Args:
#   df: data frame containing a `store` column plus the columns to scale.
#   cols_to_standardize: character vector of column names to fit stats for.
#
# Returns: a list with
#   stats - one row per store, columns named `<col>_mean` / `<col>_sd`
#   cols  - the column names the scaler was fit on
scaler_fit <- function(df, cols_to_standardize) {
  # Fail fast if the expected columns are missing (guards against typos).
  stopifnot(
    "store" %in% names(df),
    all(cols_to_standardize %in% names(df))
  )

  # For each store, compute mean and sd of each column to be standardized.
  stats <- df %>%
    group_by(store) %>%
    summarise(
      across(
        all_of(cols_to_standardize),
        list(
          mean = ~mean(.x, na.rm = TRUE),
          sd   = ~sd(.x,   na.rm = TRUE)
        )
      ),
      .groups = "drop"
    )

  list(
    stats = stats,
    cols  = cols_to_standardize
  )
}

###################### Transform using scaler ############################
# Apply a fitted per-store scaler: (x - mean) / sd for each column.
#
# Args:
#   df:     data frame with a `store` column and the columns to scale.
#   scaler: result of scaler_fit().
#   cols:   columns to transform (defaults to all columns the scaler was fit on).
#
# Returns: copy of df with `cols` replaced by their standardized values.
#          No extra columns are added.
scaler_transform <- function(df, scaler, cols = scaler$cols) {
  stats <- scaler$stats

  # Join the fitted per-store stats onto df (one stats row per store):
  df2 <- df %>% left_join(stats, by = "store")

  # Standardize each requested column in place using the fitted stats.
  # NOTE: a store with a constant column has sd == 0 and will yield NaN/Inf.
  for (col in cols) {
    mean_col <- paste0(col, "_mean") # stats contains columns with these names
    sd_col   <- paste0(col, "_sd")
    df2[[col]] <- (df2[[col]] - df2[[mean_col]]) / df2[[sd_col]]
  }

  # Drop exactly the helper columns the join added. This is safer than
  # ends_with("_mean")/ends_with("_sd"), which would also silently remove
  # any pre-existing columns of df whose names happen to end that way.
  stat_cols <- setdiff(names(stats), "store")
  df2 %>% select(-all_of(stat_cols))
}

####################### Un-standardize a col (e.g. yhat) #############
# Invert the per-store z-score for one column: x_raw = x_scaled * sd + mean.
#
# Args:
#   df:     data frame with a `store` column and the scaled column `col`.
#   scaler: result of scaler_fit() (must have been fit on `col`).
#   col:    name of the column to un-standardize.
#
# Returns: copy of df with `col` back on its original scale.
scaler_inverse_column <- function(df, scaler, col) {
  mean_col <- paste0(col, "_mean")
  sd_col   <- paste0(col, "_sd")

  # Keep only this column's stats, keyed by store. all_of() errors loudly
  # if the scaler was not fit on `col` (instead of failing later/silently).
  stats_small <- scaler$stats %>%
    select(store, all_of(c(mean_col, sd_col)))

  df2 <- df %>% left_join(stats_small, by = "store")

  # Undo z-score: x_raw = x_scaled * sd + mean
  df2[[col]] <- df2[[col]] * df2[[sd_col]] + df2[[mean_col]]

  # Drop exactly the helper columns we joined on (safer than ends_with(),
  # which could also remove unrelated *_mean / *_sd columns of df).
  df2 %>% select(-all_of(c(mean_col, sd_col)))
}

sales_cols_to_standardize <- c(
  "weekly_sales",
  "lag1", "lag2", "lag4", "lag8",
  "ma4", "ma8",
  "store_mean_to_prev", "store_sd_to_prev",
  "inter_holiday_lag1"
)

Additional setup: Define Cross Validation into one re-usable function Rolling Time-series cross validation for hyperparam selection for lasso: Note: cv_folds contains the full train data. And, design_matrix_formula contains the specification for the features we want for this problem. So, it all checks out.

# Rolling time-series cross-validation over a lambda grid for LASSO.
#
# Args:
#   cv_folds:              timetk/rsample time_series_cv object over the train data.
#   lambda_grid:           numeric vector of lambda values to evaluate.
#   design_matrix_formula: formula passed to model.matrix() to build features.
#   cols_to_standardize:   columns to z-score per store within each fold;
#                          stats are fit on the fold's training part only,
#                          so no information leaks from validation data.
#
# Returns: the lambda with the lowest mean validation MSE (also plots the curve).
lasso_cv <- function(cv_folds, lambda_grid, design_matrix_formula,
                     cols_to_standardize = sales_cols_to_standardize) {
  n_folds <- length(cv_folds$splits)
  cv_mses <- numeric(length(lambda_grid)) # mean CV MSE for each lambda

  for (lambda_ix in seq_along(lambda_grid)) {
    lambda <- lambda_grid[lambda_ix]
    fold_mses <- numeric(n_folds) # validation MSE for each fold

    for (fold_ix in seq_len(n_folds)) {
      # Get train, valid data for current cv fold:
      fold <- cv_folds$splits[[fold_ix]]
      train_data <- analysis(fold)
      valid_data <- assessment(fold)

      # Standardize cols using stats fit on this fold's training part only:
      scaler <- scaler_fit(train_data, cols_to_standardize)
      train_scaled <- scaler_transform(train_data, scaler)
      valid_scaled <- scaler_transform(valid_data, scaler)

      # Manipulate data into design matrices:
      X_train <- model.matrix(design_matrix_formula, data = train_scaled)
      y_train <- train_scaled$weekly_sales
      X_val <- model.matrix(design_matrix_formula, data = valid_scaled)
      y_val <- valid_scaled$weekly_sales

      # Fit LASSO at this single lambda (alpha = 1 -> pure L1 penalty):
      lasso_model <- glmnet(X_train, y_train, alpha = 1, lambda = lambda)

      # Get val error:
      yhat_val <- predict(lasso_model, newx = X_val, s = lambda)
      fold_mses[fold_ix] <- mean((y_val - yhat_val)^2)
    }
    cv_mses[lambda_ix] <- mean(fold_mses)
  }

  best_lambda <- lambda_grid[which.min(cv_mses)]
  cat("best lambda: ", best_lambda, "with min mse ", min(cv_mses), "\n\n")

  plot(x = lambda_grid,
       y = cv_mses,
       main = paste0(n_folds, "-Fold Cross-Validation MSE for different Lambdas"),
       xlab = "Lambda",
       ylab = "Mean CV MSE")

  best_lambda
}

4) Baseline LASSO, no lagged information

# Define design matrix for this test.
design_matrix_formula_baseline_lasso <- weekly_sales ~ holiday_flag + temperature + fuel_price + cpi + unemployment + 0 # 0 means no intercept, glmnet will add its own;

# Define lambda search grid--an easy way to get one with good scaling is from glmnet function itself:
scaler <- scaler_fit(train_df, sales_cols_to_standardize)
train_scaled <- scaler_transform(train_df, scaler)
unused_X_all <- model.matrix(design_matrix_formula_baseline_lasso, data=train_scaled)
unused_y_all <- train_scaled$weekly_sales
unused_model <- glmnet(unused_X_all, unused_y_all, alpha=1)
lambda_grid <- unused_model$lambda
lambda_grid
 [1] 0.1321184830 0.1203814411 0.1096870857 0.0999427873 0.0910641455 0.0829742578 0.0756030534 0.0688866865 0.0627669832 0.0571909374 0.0521102521
[12] 0.0474809210 0.0432628468 0.0394194947 0.0359175755 0.0327267570 0.0298194022 0.0271703286 0.0247565915 0.0225572842 0.0205533572 0.0187274536
[23] 0.0170637582 0.0155478610 0.0141666319 0.0129081073 0.0117613867 0.0107165375 0.0097645098 0.0088970577 0.0081066676 0.0073864936 0.0067302979
[34] 0.0061323968 0.0055876115 0.0050912235 0.0046389333 0.0042268232 0.0038513239 0.0035091829 0.0031974368 0.0029133853 0.0026545682 0.0024187436
[45] 0.0022038691 0.0020080835 0.0018296909 0.0016671462 0.0015190415 0.0013840940 0.0012611349 0.0011490992 0.0010470164 0.0009540023 0.0008692514
[56] 0.0007920295 0.0007216678 0.0006575568 0.0005991413 0.0005459152 0.0004974176 0.0004532284

a) cross val to select lambda

best_lambda <- lasso_cv(cv_folds, lambda_grid, design_matrix_formula_baseline_lasso)
best lambda:  0.007386494 with min mse  1.319609 

b) final model with selected lambda

# Fit model with best lambda on full data, get test error:
test_scaled <- scaler_transform(test_df, scaler)
X_test <- model.matrix(design_matrix_formula_baseline_lasso, data=test_scaled)
y_test_unscaled <- test_df$weekly_sales # not scaled

X_train <- model.matrix(design_matrix_formula_baseline_lasso, data=train_scaled)
y_train <- train_scaled$weekly_sales
best_lasso_model <- glmnet(X_train, y_train, alpha=1, lambda=best_lambda)

# Get val error:
yhat_test <- predict(best_lasso_model, newx=X_test, s=best_lambda)

# Un-scale: (need to build a small fake dataframe for this with store id)
pred_df <- test_df %>% 
  select(store) %>%
  mutate(weekly_sales = yhat_test) # note that this does not actually modify test_df
pred_df_unscaled <- scaler_inverse_column(pred_df, scaler, "weekly_sales")
yhat_test_unscaled <- pred_df_unscaled$weekly_sales

mse_test <- mean((y_test_unscaled - yhat_test_unscaled)^2)
cat("Test MSE (unscaled): ", mse_test, "\n\n")
Test MSE (unscaled):  9717192723 
wape_test <- sum(abs(y_test_unscaled - yhat_test_unscaled)) / sum(abs(y_test_unscaled))
cat("Test WAPE:", wape_test, "\n\n")
Test WAPE: 0.06438912 
coef(best_lasso_model)
7 x 1 sparse Matrix of class "dgCMatrix"
                                   s0
(Intercept)              7.055992e-01
holiday_flagNon-Holiday -4.144757e-01
holiday_flagHoliday      4.963938e-14
temperature             -5.327214e-03
fuel_price              -1.905627e-02
cpi                      3.301644e-04
unemployment             .           

5) LASSO WITH time-lagged features

# colnames(train_df)
# Define design matrix for this test.
design_matrix_formula_all_features <- weekly_sales ~ . - store - date + 0 # 0 means no intercept, glmnet will add its own;

# Define lambda search grid--an easy way to get one with good scaling is from glmnet function itself:
scaler <- scaler_fit(train_df, sales_cols_to_standardize)
train_scaled <- scaler_transform(train_df, scaler)
unused_X_all <- model.matrix(design_matrix_formula_all_features, data=train_scaled)
unused_y_all <- train_scaled$weekly_sales
unused_model <- glmnet(unused_X_all, unused_y_all, alpha=1)
lambda_grid <- unused_model$lambda
lambda_grid
 [1] 0.3932540160 0.3583184131 0.3264863928 0.2974822415 0.2710547391 0.2469749831 0.2250344063 0.2050429699 0.1868275175 0.1702302758 0.1551074873
[12] 0.1413281656 0.1287729609 0.1173331260 0.1069095744 0.0974120225 0.0887582069 0.0808731724 0.0736886227 0.0671423285 0.0611775891 0.0557427407
[23] 0.0507907093 0.0462786026 0.0421673391 0.0384213089 0.0350080657 0.0318980457 0.0290643112 0.0264823179 0.0241297018 0.0219860856 0.0200329023
[34] 0.0182532345 0.0166316674 0.0151541560 0.0138079025 0.0125812465 0.0114635632 0.0104451719 0.0095172516 0.0086717652 0.0079013895 0.0071994518
[45] 0.0065598724 0.0059771114 0.0054461213 0.0049623029 0.0045214656 0.0041197910 0.0037538001 0.0034203229 0.0031164708 0.0028396121 0.0025873488
[56] 0.0023574959 0.0021480625 0.0019572345 0.0017833592 0.0016249305 0.0014805761 0.0013490458 0.0012292003 0.0011200015 0.0010205037 0.0009298450
[67] 0.0008472401 0.0007719736 0.0007033936 0.0006409061 0.0005839697 0.0005320915 0.0004848219 0.0004417517 0.0004025077 0.0003667500 0.0003341690

a) cross val to select lambda

best_lambda <- lasso_cv(cv_folds, lambda_grid, design_matrix_formula_all_features)
best lambda:  0.007901389 with min mse  0.6604256 

b) final model with selected lambda

# Fit model with best lambda on full data, get test error:
test_scaled <- scaler_transform(test_df, scaler)
X_test <- model.matrix(design_matrix_formula_all_features, data=test_scaled)
y_test_unscaled <- test_df$weekly_sales # not scaled

X_train <- model.matrix(design_matrix_formula_all_features, data=train_scaled)
y_train <- train_scaled$weekly_sales
best_lasso_model <- glmnet(X_train, y_train, alpha=1, lambda=best_lambda)

# Get val error:
yhat_test <- predict(best_lasso_model, newx=X_test, s=best_lambda)

# Un-scale: (need to build a small fake dataframe for this with store id)
pred_df <- test_df %>% 
  select(store) %>%
  mutate(weekly_sales = yhat_test) # note that this does not actually modify test_df
pred_df_unscaled <- scaler_inverse_column(pred_df, scaler, "weekly_sales")
yhat_test_unscaled <- pred_df_unscaled$weekly_sales

mse_test <- mean((y_test_unscaled - yhat_test_unscaled)^2)
cat("Test MSE (unscaled): ", mse_test, "\n\n")
Test MSE (unscaled):  4664649702 
wape_test <- sum(abs(y_test_unscaled - yhat_test_unscaled)) / sum(abs(y_test_unscaled))
cat("Test WAPE:", wape_test, "\n\n")
Test WAPE: 0.04349279 
coef(best_lasso_model)
26 x 1 sparse Matrix of class "dgCMatrix"
                               s0
(Intercept)         -1.888898e+02
year                 9.312715e-02
month                2.009185e-01
week                 8.134132e-06
is_q4FALSE          -2.730022e-01
is_q4TRUE            .           
holiday_flagHoliday  5.465857e+00
temperature          .           
fuel_price           3.304306e-02
cpi                  .           
unemployment        -2.463699e-03
lag1                 2.520160e-01
lag2                 1.157488e-01
lag4                 2.403907e-01
lag8                 7.038497e-02
ma4                  6.324143e-02
ma8                 -5.589519e-02
store_mean_to_prev   .           
store_sd_to_prev    -7.858224e-02
sin1                 8.055230e-01
cos1                 2.309093e-02
sin2                 2.121380e-01
cos2                 1.207982e-01
inter_holiday_lag1  -1.553990e+00
inter_temp_holiday   2.827708e-03
inter_cpi_unemp      .           

Sort coeffs

coefs <- as.matrix(coef(best_lasso_model))[, 1]
coefs_sorted <- sort(abs(coefs), decreasing = TRUE)

# Print sorted coefficients with signs
coef_df <- data.frame(
  feature = names(coefs_sorted),
  coef = coefs[names(coefs_sorted)]
)
coef_df

Disregard anything below this; it is either previous notes or unfinished plots.

First CV param attempt notes: The above serves as a baseline, when using no time-averaging or time-lagged features. It could just predict the mean, not finding a strong signal with the other features. Next, we will need to try incorporating time. In the meantime, some things I will try are changing the folds and lambdas used.

second CV param attempt notes: Second attempt, used 1 year initial, and cumulative folds. Now holidays and temperature matter, which is better than the first attempt, which had 0’s for all features and intercept was the only kept feature. MSE stayed the same.

LASSO - baseline, NO lagged variables.

5) Plots to analyze the results

Lasso Path–shows variables entering and leaving as lambda (regularization) increases. (NOT cross val).

fit_path <- glmnet(X_train, y_train, alpha = 1)  # doesnt take best lambda, fits a bunch of lambdas, no CV
plot(fit_path, xvar = "lambda", label = TRUE)
title("LASSO Coefficient Paths")


# attempting to get a mapping to var names but this doesnt include intercept (TODO)
beta_mat <- as.matrix(fit_path$beta)
#rownames(beta_mat)
variable_map <- data.frame(
  index = seq_len(nrow(beta_mat)),
  variable = rownames(beta_mat)
)

print(variable_map)
 # 1) Extract coefficients for each tested lambda value, and convert datatype to R matrix:
 Beta_lambda <- as.matrix(coef(fit_path, s=fit_path$lambda))
 
 # 2) Even tho I didn't specify intercept, there is a intercept row of 0's. 
#Beta_lambda <- Beta_lambda[-1,, drop=FALSE] # all rows except row 1, all columns; dont change dimensions
 # print(Beta_lambda)
 cat('Beta_lambda shape:', dim(Beta_lambda), '\n') # shape: (p x #lambdas
 
 # 3) Get L0, L1 norms:
 l1_norm = colSums(abs(Beta_lambda)) # sum up each betah_lambda col; colSums computes sum of each column
 l0_norm = colSums(Beta_lambda != 0) # number of nonzero coeffs for each lambda
 # 4) Want the plot's x axis to be increasing model complexity (incr L1 norm to the right)
 incr_order_ixs = order(l1_norm) # returns indices to make it incr order
 l1_norm_ordered = l1_norm[incr_order_ixs]
 l0_norm_ordered = l0_norm[incr_order_ixs]
 Beta_lambda_ordered = Beta_lambda[,incr_order_ixs,drop=FALSE]
 # 5) Plot coeff vs. L1 norm:
 # note: currently, Beta_lambda's columns are for each lambda, but matplot plots columns of y
 #       as separate lines. We want a line for each coeff (p). Beta_lambda is (p x #lambdas).
 #       so, we take the transpose of beta.
 #       also, num rows should match for x (#lambdas x 1) and y (p x #lambdas).
 cat('L1 norm shape:', length(l1_norm_ordered), '\n')
 cat('Beta_lambda shape:', dim(Beta_lambda_ordered), '\n')
 
 matplot(x=l1_norm,  y=t(Beta_lambda),
 type="l",# plot type: line
 lwd=2, # line width
 lty=1, # line type (solid, dashed)
 xlab=expression("L1 Norm (" * "||" * hat(beta[lambda]) * "||"[1] * ")"),
 ylab="Coefficients",
 main="Lasso Path")
 abline(h = 0, col = "black", lwd = 1, lty = 2) # show x axis more easily
---
title: "Lasso - Walmart: updated data"
output: html_notebook
---

```{r}
# install.packages("tidyverse")
# install.packages("timetk")
library(tidyverse)
library(readr) #read_csv()
library(dplyr)
library(glmnet)
library(timetk)
library(rsample)
```


### 1) Load Data
```{r}
# Load data:
df <- read_csv("./data/walmart_features.csv")
head(df)
```

**Data Preprocessing + Sort Values**
Notes: Date is already in Date format. Here we just are converting store, holiday_flag from numeric/character to categorical type and dropping redundant columns.
```{r}
print(sapply(df, class)) # view column datatypes before conversion

# Convert id/flag columns to factors, sort chronologically, drop redundant column:
df <- df %>%
  mutate(
    store = as.factor(store),
    # levels = unique(...) keeps factor levels in first-appearance order
    holiday_flag = factor(holiday_flag, levels = unique(holiday_flag))
  ) %>%
  arrange(date, store) %>%  # Sort by date, then store, ascending order
  select(-is_holiday_num) # drop col (redundant with holiday_flag)

head(df)
```

```{r}
# Checking the data (uncomment to see output if you wish)
print(dim(df))
#table(df$date) # get value_counts per date -> is 45.
```
**There are 45 stores, so it makes sense that there are 45 entries for each date. Also, each date represents one week.**

### 2) Train/Test split on sorted data:
Here we perform an 80/20 train/test split to hold back data for comparing different models' performances. This is not a
random split, as we need to preserve time ordering. Also, it is not an 80/20 split on the row indices, since that could
cause rows from different stores that share the same date to be split up.
```{r}
# Train/Test Split (time-based, not random, to avoid leakage):
all_dates <- unique(df$date) # already sorted (df was arranged by date above)
cutoff_ix <- floor(0.8 * length(all_dates)) # 80% of distinct weeks go to train
cutoff_date <- all_dates[cutoff_ix]
print(cutoff_date)

# Split on the cutoff DATE so all stores for a given week stay together:
train_df <- df %>% filter(date <= cutoff_date)
test_df  <- df %>% filter(date >  cutoff_date)

print(dim(train_df))
print(dim(test_df))
cat("train:", format(min(train_df$date), "%Y-%m-%d"), " to ", format(max(train_df$date), "%Y-%m-%d"),"\n")
cat("test:", format(min(test_df$date), "%Y-%m-%d"), " to ", format(max(test_df$date), "%Y-%m-%d"),"\n")
print(colnames(df))
```
### 3) Set Up Rolling Time-series cross validation
It is important that validation windows always come after their training windows in time; otherwise the model
could cheat by using information from the future to predict the past (leakage).
```{r}
# NOTE - ctrl shift c to mass uncomment/comment out

# Attempt 1 - shorter
# cv_folds <- time_series_cv(
#   data = train_df,
#   date_var = Date,
#   cumulative = FALSE, # no growing window (same size folds)
#   initial = "6 months", # length of train window
#   assess = "3 months", # length of validation window
#   skip = "3 months", # jump between folds
#   slice_limit = 5 # num folds max
# )

# Attempt 2 -- longer, cumulative windowing
cv_folds <- time_series_cv(
  data = train_df,
  date_var = date,
  cumulative = TRUE, # growing window: each fold's train window expands over all earlier data
  initial = "1 year", # length of the FIRST train window (it grows afterwards)
  assess = "3 months", # length of validation window
  skip = "3 months", # jump between folds
  slice_limit = 5 # num folds max
)

plot_time_series_cv_plan(cv_folds, .date_var = date, .value = weekly_sales, .interactive=TRUE)
```

**Additional setup: Define Functions for Standardization**
Normalize weekly sales within each store, so that no one store dominates, and weekly sales are comparable.
i.e. (sales_store - mean(sales_store)) / std(sales_store)
and can convert back later via (scaled_sales * store_std) + store_mean
```{r}
##################### Fit Standard Scaler ###########################
# Fit a per-store standard scaler.
#
# Args:
#   df: data frame containing a `store` column plus the columns to scale.
#   cols_to_standardize: character vector of column names to fit stats for.
#
# Returns: a list with
#   stats - one row per store, columns named `<col>_mean` / `<col>_sd`
#   cols  - the column names the scaler was fit on
scaler_fit <- function(df, cols_to_standardize) {
  # Fail fast if the expected columns are missing (guards against typos).
  stopifnot(
    "store" %in% names(df),
    all(cols_to_standardize %in% names(df))
  )

  # For each store, compute mean and sd of each column to be standardized.
  stats <- df %>%
    group_by(store) %>%
    summarise(
      across(
        all_of(cols_to_standardize),
        list(
          mean = ~mean(.x, na.rm = TRUE),
          sd   = ~sd(.x,   na.rm = TRUE)
        )
      ),
      .groups = "drop"
    )

  list(
    stats = stats,
    cols  = cols_to_standardize
  )
}

###################### Transform using scaler ############################
# Apply a fitted per-store scaler: (x - mean) / sd for each column.
#
# Args:
#   df:     data frame with a `store` column and the columns to scale.
#   scaler: result of scaler_fit().
#   cols:   columns to transform (defaults to all columns the scaler was fit on).
#
# Returns: copy of df with `cols` replaced by their standardized values.
#          No extra columns are added.
scaler_transform <- function(df, scaler, cols = scaler$cols) {
  stats <- scaler$stats

  # Join the fitted per-store stats onto df (one stats row per store):
  df2 <- df %>% left_join(stats, by = "store")

  # Standardize each requested column in place using the fitted stats.
  # NOTE: a store with a constant column has sd == 0 and will yield NaN/Inf.
  for (col in cols) {
    mean_col <- paste0(col, "_mean") # stats contains columns with these names
    sd_col   <- paste0(col, "_sd")
    df2[[col]] <- (df2[[col]] - df2[[mean_col]]) / df2[[sd_col]]
  }

  # Drop exactly the helper columns the join added. This is safer than
  # ends_with("_mean")/ends_with("_sd"), which would also silently remove
  # any pre-existing columns of df whose names happen to end that way.
  stat_cols <- setdiff(names(stats), "store")
  df2 %>% select(-all_of(stat_cols))
}

####################### Un-standardize a col (e.g. yhat) #############
# Invert the per-store z-score for one column: x_raw = x_scaled * sd + mean.
#
# Args:
#   df:     data frame with a `store` column and the scaled column `col`.
#   scaler: result of scaler_fit() (must have been fit on `col`).
#   col:    name of the column to un-standardize.
#
# Returns: copy of df with `col` back on its original scale.
scaler_inverse_column <- function(df, scaler, col) {
  mean_col <- paste0(col, "_mean")
  sd_col   <- paste0(col, "_sd")

  # Keep only this column's stats, keyed by store. all_of() errors loudly
  # if the scaler was not fit on `col` (instead of failing later/silently).
  stats_small <- scaler$stats %>%
    select(store, all_of(c(mean_col, sd_col)))

  df2 <- df %>% left_join(stats_small, by = "store")

  # Undo z-score: x_raw = x_scaled * sd + mean
  df2[[col]] <- df2[[col]] * df2[[sd_col]] + df2[[mean_col]]

  # Drop exactly the helper columns we joined on (safer than ends_with(),
  # which could also remove unrelated *_mean / *_sd columns of df).
  df2 %>% select(-all_of(c(mean_col, sd_col)))
}

# Sales-scale columns to standardize per store: the target plus all features
# derived from it (lags, moving averages, store stats, interaction with lag1).
sales_cols_to_standardize <- c(
  "weekly_sales",
  "lag1", "lag2", "lag4", "lag8",
  "ma4", "ma8",
  "store_mean_to_prev", "store_sd_to_prev",
  "inter_holiday_lag1"
)
```

**Additional setup: Define Cross Validation into one re-usable function**
Rolling Time-series cross validation for hyperparam selection for lasso:
Note: cv_folds contains the full train data. And, design_matrix_formula contains the
specification for the features we want for this problem. So, it all checks out.
```{r}
# Rolling time-series cross-validation over a lambda grid for LASSO.
#
# Args:
#   cv_folds:              timetk/rsample time_series_cv object over the train data.
#   lambda_grid:           numeric vector of lambda values to evaluate.
#   design_matrix_formula: formula passed to model.matrix() to build features.
#   cols_to_standardize:   columns to z-score per store within each fold;
#                          stats are fit on the fold's training part only,
#                          so no information leaks from validation data.
#
# Returns: the lambda with the lowest mean validation MSE (also plots the curve).
lasso_cv <- function(cv_folds, lambda_grid, design_matrix_formula,
                     cols_to_standardize = sales_cols_to_standardize) {
  n_folds <- length(cv_folds$splits)
  cv_mses <- numeric(length(lambda_grid)) # mean CV MSE for each lambda

  for (lambda_ix in seq_along(lambda_grid)) {
    lambda <- lambda_grid[lambda_ix]
    fold_mses <- numeric(n_folds) # validation MSE for each fold

    for (fold_ix in seq_len(n_folds)) {
      # Get train, valid data for current cv fold:
      fold <- cv_folds$splits[[fold_ix]]
      train_data <- analysis(fold)
      valid_data <- assessment(fold)

      # Standardize cols using stats fit on this fold's training part only:
      scaler <- scaler_fit(train_data, cols_to_standardize)
      train_scaled <- scaler_transform(train_data, scaler)
      valid_scaled <- scaler_transform(valid_data, scaler)

      # Manipulate data into design matrices:
      X_train <- model.matrix(design_matrix_formula, data = train_scaled)
      y_train <- train_scaled$weekly_sales
      X_val <- model.matrix(design_matrix_formula, data = valid_scaled)
      y_val <- valid_scaled$weekly_sales

      # Fit LASSO at this single lambda (alpha = 1 -> pure L1 penalty):
      lasso_model <- glmnet(X_train, y_train, alpha = 1, lambda = lambda)

      # Get val error:
      yhat_val <- predict(lasso_model, newx = X_val, s = lambda)
      fold_mses[fold_ix] <- mean((y_val - yhat_val)^2)
    }
    cv_mses[lambda_ix] <- mean(fold_mses)
  }

  best_lambda <- lambda_grid[which.min(cv_mses)]
  cat("best lambda: ", best_lambda, "with min mse ", min(cv_mses), "\n\n")

  plot(x = lambda_grid,
       y = cv_mses,
       main = paste0(n_folds, "-Fold Cross-Validation MSE for different Lambdas"),
       xlab = "Lambda",
       ylab = "Mean CV MSE")

  best_lambda
}
```


### 4) Baseline LASSO, no lagged information
```{r}
# Define design matrix for this test (exogenous features only, no lagged info).
design_matrix_formula_baseline_lasso <- weekly_sales ~ holiday_flag + temperature + fuel_price + cpi + unemployment + 0 # 0 means no intercept, glmnet will add its own;

# Define lambda search grid--an easy way to get one with good scaling is to let
# glmnet pick its own data-driven lambda sequence and reuse it:
scaler <- scaler_fit(train_df, sales_cols_to_standardize)
train_scaled <- scaler_transform(train_df, scaler)
unused_X_all <- model.matrix(design_matrix_formula_baseline_lasso, data=train_scaled)
unused_y_all <- train_scaled$weekly_sales
unused_model <- glmnet(unused_X_all, unused_y_all, alpha=1)
lambda_grid <- unused_model$lambda
lambda_grid
```

#### a) cross val to select lambda

```{r}
best_lambda <- lasso_cv(cv_folds, lambda_grid, design_matrix_formula_baseline_lasso)
```
#### b) final model with selected lambda
```{r}
# Fit model with best lambda on the full training data, then get TEST error:
test_scaled <- scaler_transform(test_df, scaler)
X_test <- model.matrix(design_matrix_formula_baseline_lasso, data=test_scaled)
y_test_unscaled <- test_df$weekly_sales # not scaled

X_train <- model.matrix(design_matrix_formula_baseline_lasso, data=train_scaled)
y_train <- train_scaled$weekly_sales
best_lasso_model <- glmnet(X_train, y_train, alpha=1, lambda=best_lambda)

# Predict on the held-out test set (still on the standardized scale).
# predict() returns a 1-column matrix; as.numeric() flattens it to a plain
# vector so mutate() below stores a regular column, not a matrix column.
yhat_test <- as.numeric(predict(best_lasso_model, newx=X_test, s=best_lambda))

# Un-scale: build a small helper dataframe carrying the store id so the
# per-store inverse transform can join on it.
pred_df <- test_df %>% 
  select(store) %>%
  mutate(weekly_sales = yhat_test) # note: this does not modify test_df
pred_df_unscaled <- scaler_inverse_column(pred_df, scaler, "weekly_sales")
yhat_test_unscaled <- pred_df_unscaled$weekly_sales

# Report error metrics on the original (unscaled) sales scale:
mse_test <- mean((y_test_unscaled - yhat_test_unscaled)^2)
cat("Test MSE (unscaled): ", mse_test, "\n\n")

wape_test <- sum(abs(y_test_unscaled - yhat_test_unscaled)) / sum(abs(y_test_unscaled))
cat("Test WAPE:", wape_test, "\n\n")

coef(best_lasso_model)
```
### 5) LASSO WITH time-lagged features
```{r}
# colnames(train_df)
# Define design matrix for this test: all columns except store and date,
# so all lagged / rolling / seasonal features are included.
design_matrix_formula_all_features <- weekly_sales ~ . - store - date + 0 # 0 means no intercept, glmnet will add its own;

# Define lambda search grid--reuse glmnet's own data-driven lambda sequence:
scaler <- scaler_fit(train_df, sales_cols_to_standardize)
train_scaled <- scaler_transform(train_df, scaler)
unused_X_all <- model.matrix(design_matrix_formula_all_features, data=train_scaled)
unused_y_all <- train_scaled$weekly_sales
unused_model <- glmnet(unused_X_all, unused_y_all, alpha=1)
lambda_grid <- unused_model$lambda
lambda_grid
```
#### a) cross val to select lambda
```{r}
best_lambda <- lasso_cv(cv_folds, lambda_grid, design_matrix_formula_all_features)
```

#### b) final model with selected lambda
```{r}
# Fit model with best lambda on the full training data, then get TEST error:
test_scaled <- scaler_transform(test_df, scaler)
X_test <- model.matrix(design_matrix_formula_all_features, data=test_scaled)
y_test_unscaled <- test_df$weekly_sales # not scaled

X_train <- model.matrix(design_matrix_formula_all_features, data=train_scaled)
y_train <- train_scaled$weekly_sales
best_lasso_model <- glmnet(X_train, y_train, alpha=1, lambda=best_lambda)

# Predict on the held-out test set (still on the standardized scale).
# predict() returns a 1-column matrix; as.numeric() flattens it to a plain
# vector so mutate() below stores a regular column, not a matrix column.
yhat_test <- as.numeric(predict(best_lasso_model, newx=X_test, s=best_lambda))

# Un-scale: build a small helper dataframe carrying the store id so the
# per-store inverse transform can join on it.
pred_df <- test_df %>% 
  select(store) %>%
  mutate(weekly_sales = yhat_test) # note: this does not modify test_df
pred_df_unscaled <- scaler_inverse_column(pred_df, scaler, "weekly_sales")
yhat_test_unscaled <- pred_df_unscaled$weekly_sales

# Report error metrics on the original (unscaled) sales scale:
mse_test <- mean((y_test_unscaled - yhat_test_unscaled)^2)
cat("Test MSE (unscaled): ", mse_test, "\n\n")

wape_test <- sum(abs(y_test_unscaled - yhat_test_unscaled)) / sum(abs(y_test_unscaled))
cat("Test WAPE:", wape_test, "\n\n")

coef(best_lasso_model)
```
**Sort coeffs**
```{r}
# Extract coefficients as a named numeric vector from the fitted model:
coefs <- as.matrix(coef(best_lasso_model))[, 1]
# Order by absolute magnitude, largest first (abs is used only for ordering):
coefs_sorted <- sort(abs(coefs), decreasing = TRUE)

# Print sorted coefficients with their original signs restored via name lookup:
coef_df <- data.frame(
  feature = names(coefs_sorted),
  coef = coefs[names(coefs_sorted)]
)
coef_df
```


###########################################################################################################
###########################################################################################################
## Disregard Anything below this, is either previous notes or unfinished plots
###########################################################################################################
###########################################################################################################
###########################################################################################################
###########################################################################################################
###########################################################################################################
###########################################################################################################
###########################################################################################################
###########################################################################################################
###########################################################################################################


**First CV param attempt notes:**
The above serves as a baseline, when using no time-averaging or time-lagged features. It could just predict the mean, not finding a strong signal with the other features. Next, we will need to try incorporating time. In the meantime, some things I will try are changing the folds and lambdas used.

**second CV param attempt notes:**
Second attempt, used 1 year initial, and cumulative folds. Now holidays and temperature matter, which is better than the first attempt, which had 0's for all features and intercept was the only kept feature. MSE stayed the same.

LASSO - baseline, NO lagged variables.

### 5) Plots to analyze the results
**Lasso Path--shows variables entering and leaving as lambda (regularization) increases. (NOT cross val).**
```{r}
fit_path <- glmnet(X_train, y_train, alpha = 1)  # fits the whole lambda path (no CV; ignores best_lambda)
plot(fit_path, xvar = "lambda", label = TRUE)
title("LASSO Coefficient Paths")


# Map the numeric labels in the path plot to variable names.
# NOTE: fit_path$beta excludes the intercept row (TODO: include it if needed).
beta_mat <- as.matrix(fit_path$beta)
#rownames(beta_mat)
variable_map <- data.frame(
  index = seq_len(nrow(beta_mat)),
  variable = rownames(beta_mat)
)

print(variable_map)
```
```{r}
 # 1) Extract coefficients for each tested lambda value, and convert datatype to R matrix:
 Beta_lambda <- as.matrix(coef(fit_path, s=fit_path$lambda))
 
 # 2) Even tho I didn't specify intercept, there is a intercept row of 0's. 
#Beta_lambda <- Beta_lambda[-1,, drop=FALSE] # all rows except row 1, all columns; dont change dimensions
 # print(Beta_lambda)
 cat('Beta_lambda shape:', dim(Beta_lambda), '\n') # shape: (p x #lambdas
 
 # 3) Get L0, L1 norms:
 l1_norm = colSums(abs(Beta_lambda)) # sum up each betah_lambda col; colSums computes sum of each column
 l0_norm = colSums(Beta_lambda != 0) # number of nonzero coeffs for each lambda
 # 4) Want the plot's x axis to be increasing model complexity (incr L1 norm to the right)
 incr_order_ixs = order(l1_norm) # returns indices to make it incr order
 l1_norm_ordered = l1_norm[incr_order_ixs]
 l0_norm_ordered = l0_norm[incr_order_ixs]
 Beta_lambda_ordered = Beta_lambda[,incr_order_ixs,drop=FALSE]
 # 5) Plot coeff vs. L1 norm:
 # note: currently, Beta_lambda's columns are for each lambda, but matplot plots columns of y
 #       as separate lines. We want a line for each coeff (p). Beta_lambda is (p x #lambdas).
 #       so, we take the transpose of beta.
 #       also, num rows should match for x (#lambdas x 1) and y (p x #lambdas).
 cat('L1 norm shape:', length(l1_norm_ordered), '\n')
 cat('Beta_lambda shape:', dim(Beta_lambda_ordered), '\n')
 
 matplot(x=l1_norm,  y=t(Beta_lambda),
 type="l",# plot type: line
 lwd=2, # line width
 lty=1, # line type (solid, dashed)
 xlab=expression("L1 Norm (" * "||" * hat(beta[lambda]) * "||"[1] * ")"),
 ylab="Coefficients",
 main="Lasso Path")
 abline(h = 0, col = "black", lwd = 1, lty = 2) # show x axis more easily
```

